{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing of PCNet PPI network\n",
    "In this notebook, I'll preprocess the [PCNet network](https://www.sciencedirect.com/science/article/pii/S2405471218300954) from [NDEx network exchange](http://www.ndexbio.org/#/networkset/e8ebbdde-86dc-11e7-a10d-0ac135e8bacf?accesskey=7fbd23635b798321954e66c63526c46397a3f45b40298cf43f22d07d4feed0fa).\n",
    "The network is distributed in the CX (Cytoscape Exchange) format, which is JSON-based and can therefore be parsed with Python's `json` module and then processed with pandas."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import json\n",
    "import networkx as nx\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded PCNet PPI network with 19781 nodes and 2724724 edges\n"
     ]
    }
   ],
   "source": [
    "pcnet_file = '../../../data/networks/pcnet.cx'\n",
    "# CX is JSON, so read it as UTF-8 instead of relying on the locale default\n",
    "with open(pcnet_file, encoding='utf-8') as cx_f:\n",
    "    all_network_components = json.load(cx_f)\n",
    "\n",
    "# A CX document is a list of aspect fragments and the same aspect may be\n",
    "# split over several fragments, so collect all of them instead of keeping\n",
    "# only the last 'nodes' / 'edges' fragment encountered.\n",
    "pcnet_nodes = []\n",
    "pcnet_edges = []\n",
    "for comp in all_network_components:\n",
    "    if 'nodes' in comp:\n",
    "        pcnet_nodes.extend(comp['nodes'])\n",
    "    if 'edges' in comp:\n",
    "        pcnet_edges.extend(comp['edges'])\n",
    "\n",
    "# fail with a clear message rather than continuing with an empty network\n",
    "if not pcnet_nodes or not pcnet_edges:\n",
    "    raise ValueError('No nodes or edges aspect found in {}'.format(pcnet_file))\n",
    "print(\"Loaded PCNet PPI network with {} nodes and {} edges\".format(len(pcnet_nodes), len(pcnet_edges)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Source</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>6239</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>7486</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>6851</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Source  Target\n",
       "0       0       1\n",
       "1       0       2\n",
       "2       0    6239\n",
       "3       0    7486\n",
       "4       0    6851"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get DataFrames for edges and nodes; the CX keys are selected by name so the\n",
    "# result does not depend on the key order inside the JSON records\n",
    "# ('s'/'t' = source/target node id of an edge, 'n'/'r' = node name/represents)\n",
    "edgelist = pd.DataFrame(pcnet_edges)[['s', 't']].rename(columns={'s': 'Source', 't': 'Target'})\n",
    "# edges reference nodes through the node '@id', so index the node table by it;\n",
    "# joining on the positional row index is only right if '@id' equals the row number\n",
    "nodes = pd.DataFrame(pcnet_nodes).set_index('@id')[['n', 'r']].rename(columns={'n': 'Name', 'r': 'hgnc_symbol'})\n",
    "\n",
    "# sanity checks: node ids must be unique and every edge endpoint must be a known node\n",
    "assert nodes.index.is_unique, 'duplicate node @id in CX nodes aspect'\n",
    "assert edgelist[['Source', 'Target']].isin(nodes.index).all().all(), 'edge refers to an unknown node @id'\n",
    "\n",
    "# join edgelist with the node names to have meaningful node names\n",
    "edgelist_names_source = edgelist.join(nodes[['Name']].rename(columns={'Name': 'Source_Name'}), on='Source')\n",
    "edgelist_names = edgelist_names_source.join(nodes[['Name']].rename(columns={'Name': 'Target_Name'}), on='Target')\n",
    "edgelist_names.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build a networkx graph whose nodes are the joined node names\n",
    "G = nx.from_pandas_edgelist(\n",
    "    edgelist_names,\n",
    "    source='Source_Name',\n",
    "    target='Target_Name',\n",
    ")\n",
    "# summary: node count, edge count and whether the graph is connected\n",
    "(G.number_of_nodes(), G.number_of_edges(), nx.is_connected(G))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write the named edge list as a gzip-compressed, tab-separated file\n",
    "edgelist_names.to_csv(\n",
    "    '../../../data/networks/pcnet_edgelist.tsv.gz',\n",
    "    sep='\\t',\n",
    "    compression='gzip',\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
